smallRNA Differential Expression

Author

Lance Parsons

Load libraries

This project uses renv to keep track of installed packages. Install renv if not installed and load dependencies with renv::restore().

install.packages("renv")
renv::restore()

Read sample data

Get list of samples

Code
samples <- read_tsv("config/samples.tsv",
  col_types = list(
    sample_name = col_character(),
    cell_line = col_factor(),
    exogenous_rna = col_factor(),
    day = col_factor()
  )
)
units <- read_tsv("config/units.tsv",
  col_types = list(
    sample_name = col_character(),
    unit_name = col_character(),
    fq1 = col_character(),
    fq2 = col_character()
  )
)
sample_units <- dplyr::left_join(samples, units, by = "sample_name") %>%
  unite(sample_unit, sample_name, unit_name, remove = FALSE) %>%
  mutate(
    day = as.factor(paste0("day", day)),
    batch = as.factor(case_when(
      exogenous_rna == "mastermix1" ~ "batch3",
      exogenous_rna == "mastermix2" ~ "batch3",
      TRUE ~ "batch2"
    )),
    .before = cell_line,
  )

sample_units

Table of exogenous RNA mixtures

Code
# Exogenous RNA mixtures
rna_mixes <- tibble()
for (mix in c("mastermix1", "mastermix2", "PJY103_mDNMT1", "PJY300_mDNMT1")) {
  t <- Biostrings::readDNAStringSet(sprintf("data/references/%s.fa", mix))
  rna_mixes <- rbind(rna_mixes, tibble(
    exogenous_rna = mix,
    rna_species = word(t@ranges@NAMES, 1),
    start = 1,
    end = t@ranges@width
  ))
}

rna_mixes <- rna_mixes %>%
  mutate(
    active_cis_start_max = case_when(
      exogenous_rna == "PJY103_mDNMT1" ~ 16,
      exogenous_rna == "PJY300_mDNMT1" ~ 16,
      exogenous_rna == "mastermix1" ~ 15,
      exogenous_rna == "mastermix2" ~ 15
    ),
    active_cis_end_min = case_when(
      exogenous_rna == "PJY103_mDNMT1" ~ 74,
      exogenous_rna == "PJY300_mDNMT1" ~ 74,
      exogenous_rna == "mastermix1" ~ 73,
      exogenous_rna == "mastermix2" ~ 73
    ),
    inactive_ct_end_max = case_when(
      exogenous_rna == "PJY103_mDNMT1" ~ 38,
      exogenous_rna == "PJY300_mDNMT1" ~ 38,
      exogenous_rna == "mastermix1" ~ 37,
      exogenous_rna == "mastermix2" ~ 37
    ),
    active_trans_start_max = case_when(
      rna_species == "PJY103_mDNMT1" ~ 115,
      rna_species == "PJY300_mDNMT1" ~ 115,
      rna_species == "PJY179_FANCF" ~ 119,
      rna_species == "PJY181_HEK3" ~ 120,
      rna_species == "PJY182_HEK3" ~ 120,
      rna_species == "PJY183_DNMT1" ~ 113,
      rna_species == "PJY184_DNMT1" ~ 113,
      rna_species == "PJY186_RUNX1" ~ 117,
      rna_species == "PJY185_RUNX1" ~ 117,
      rna_species == "PJY187_VEGFA" ~ 124,
      rna_species == "PJY306_EMX1" ~ 118,
      rna_species == "PJY302_EMX1" ~ 118,
      rna_species == "PJY177_RNF2" ~ 120
    ),
    active_trans_end_min = case_when(
      rna_species == "PJY103_mDNMT1" ~ 121,
      rna_species == "PJY300_mDNMT1" ~ 121,
      rna_species == "PJY179_FANCF" ~ 124,
      rna_species == "PJY181_HEK3" ~ 121,
      rna_species == "PJY182_HEK3" ~ 121,
      rna_species == "PJY183_DNMT1" ~ 118,
      rna_species == "PJY184_DNMT1" ~ 118,
      rna_species == "PJY186_RUNX1" ~ 122,
      rna_species == "PJY185_RUNX1" ~ 122,
      rna_species == "PJY187_VEGFA" ~ 129,
      rna_species == "PJY306_EMX1" ~ 123,
      rna_species == "PJY302_EMX1" ~ 123,
      rna_species == "PJY177_RNF2" ~ 121
    ),
  )

rna_mixes

Exogenous RNA counts

BAM reading functions

Code
rna_species_plot_range <- function(sequence_name) {
  plot_range <- rna_mixes %>%
    dplyr::filter(.data$rna_species == sequence_name) %>%
    dplyr::select(start, end) %>%
    unique()
  return(plot_range)
}

# GRanges from BAM
granges_from_bam <- function(sample_unit,
                             sequence_name,
                             is_proper_pair = TRUE,
                             bam_dir =
                               "results/alignments/exogenous_rna/sorted") {
  plot_range <- rna_species_plot_range(sequence_name)
  which <- GRanges(
    sprintf("%s:%i-%i", sequence_name, plot_range$start, plot_range$end)
  )

  param <- ScanBamParam(
    flag = scanBamFlag(isProperPair = is_proper_pair),
    mapqFilter = 1,
    which = which
  )

  aligned_fragments_list <- list()

  if (is_proper_pair) {
    aligned_fragments <- granges(
      readGAlignmentPairs(
        sprintf("%s/%s.bam", bam_dir, sample_unit),
        param = param
      ),
      on.discordant.seqnames = "drop"
    )
    rna_info <- rna_mixes %>%
      dplyr::filter(rna_species == {{ sequence_name }}) %>%
      dplyr::select(-"exogenous_rna") %>%
      unique()

    # Active in cis
    aligned_fragments_list$active_cis <- aligned_fragments %>%
      plyranges::filter(
        start <= rna_info$active_cis_start_max,
        end >= rna_info$active_cis_end_min
      )

    # Active in trans
    aligned_fragments_list$active_trans <- aligned_fragments %>%
      plyranges::filter(
        start > rna_info$active_cis_start_max,
        start <= rna_info$active_trans_start_max,
        end >= rna_info$active_trans_end_min
      )

    # Inactive Cryptic Termination
    aligned_fragments_list$inactive_cryptic_terminiation <-
      aligned_fragments %>%
      plyranges::filter(end < rna_info$inactive_ct_end_max)

    # Inactive Other
    aligned_fragments_list$inactive_other <- plyranges::bind_ranges(
      aligned_fragments %>%
        plyranges::filter(
          start <= rna_info$active_cis_start_max,
          end < rna_info$active_cis_end_min,
          end > rna_info$inactive_ct_end_max
        ),
      aligned_fragments %>%
        plyranges::filter(
          start > rna_info$active_cis_start_max,
          (
            start > rna_info$active_trans_start_max |
              end < rna_info$active_trans_end_min
          )
        ),
    )
  } else {
    aligned_fragments <- granges(
      readGAlignments(
        sprintf("%s/%s.bam", bam_dir, sample_unit),
        param = param
      )
    )
    aligned_fragments_list$discordant <- aligned_fragments
  }
  return(aligned_fragments_list)
}

Read BAM coverage

Code
rna_species_plot_data <- list()
for (rna_species_to_plot in rna_mixes$rna_species) {
  rna_info <- rna_mixes %>%
    dplyr::filter(rna_species == rna_species_to_plot)

  sample_units_to_plot <- sample_units %>%
    dplyr::filter(exogenous_rna %in% rna_info$exogenous_rna)

  granges_to_plot <- list()
  for (sample_unit in sample_units_to_plot$sample_unit) {
    granges_to_plot[[sample_unit]] <- granges_from_bam(
      sample_unit,
      rna_species_to_plot,
      TRUE
    )
  }
  rna_species_plot_data[[rna_species_to_plot]] <- granges_to_plot
}

Organize coverage data

Code
exogenous_rna_count_data <- tibble(
  sample_unit = character(),
  sequence_name = character(),
  category = character(),
  count = numeric()
)
for (rna_species in names(rna_species_plot_data)) {
  for (sample_unit in names(rna_species_plot_data[[rna_species]])) {
    for (category in
      names(rna_species_plot_data[[rna_species]][[sample_unit]])) {
      exogenous_rna_count_data <- exogenous_rna_count_data %>%
        add_row(
          sample_unit = sample_unit,
          sequence_name = rna_species,
          category = category,
          count = length(
            rna_species_plot_data[[rna_species]][[sample_unit]][[category]]
          )
        )
    }
  }
}
exogenous_rna_count_data

Summarize coverage data

Code
exogenous_rna_mapped_totals <- exogenous_rna_count_data %>%
  group_by(sample_unit, sequence_name) %>%
  summarize(mapped_fragments = sum(count))
`summarise()` has grouped output by 'sample_unit'. You can override using the
`.groups` argument.
Code
exogenous_rna_mapped_totals

Human small RNA gene counts

  • count only fragments that were properly aligned
  • annotate with GENCODE gene model
  • primary alignments were counted, even if the fragments aligned multiple times
  • fragments aligning to multiple features were assigned to the feature that mostly closely overlapped with the fragment
  • exogenous RNA counts are total fragments that aligned
Code
human_counts_dir <- "results/smrna_count/"
human_gene_counts_files <- paste0(
  human_counts_dir,
  sample_units$sample_unit,
  "_first_proper_pair_gene_count.txt"
)

human_gene_counts <- readr::read_tsv(
  human_gene_counts_files[1],
  comment = "#",
  col_names = c("gene", human_gene_counts_files[1]),
  col_types = "ci"
)

for (i in 2:length(human_gene_counts_files)) {
  gene_sample <-
    readr::read_tsv(
      human_gene_counts_files[i],
      comment = "#",
      col_names = c("gene", human_gene_counts_files[i]),
      col_types = "ci"
    )
  human_gene_counts <- human_gene_counts %>%
    dplyr::full_join(gene_sample, by = "gene")
}

human_gene_counts <- human_gene_counts %>%
  rename_all(~ stringr::str_replace_all(
    ., human_counts_dir, ""
  )) %>%
  rename_all(~ str_replace_all(., "_first_proper_pair_gene_count.txt", ""))

exogenous_gene_counts <- exogenous_rna_count_data %>%
  tidyr::unite(gene, c(sequence_name, category), sep = ":") %>%
  tidyr::spread(sample_unit, count)

gene_counts <- rbind(human_gene_counts, exogenous_gene_counts)

gene_counts

Exogenous RNA “gene” names

Code
# List of exogenous genes to highlight
exogenous_rna_names <- gene_counts %>%
  dplyr::filter(str_detect(gene, "^PJY")) %>%
  pull(gene)
exogenous_rna_names
 [1] "PJY103_mDNMT1:active_cis"                   
 [2] "PJY103_mDNMT1:active_trans"                 
 [3] "PJY103_mDNMT1:inactive_cryptic_terminiation"
 [4] "PJY103_mDNMT1:inactive_other"               
 [5] "PJY177_RNF2:active_cis"                     
 [6] "PJY177_RNF2:active_trans"                   
 [7] "PJY177_RNF2:inactive_cryptic_terminiation"  
 [8] "PJY177_RNF2:inactive_other"                 
 [9] "PJY179_FANCF:active_cis"                    
[10] "PJY179_FANCF:active_trans"                  
[11] "PJY179_FANCF:inactive_cryptic_terminiation" 
[12] "PJY179_FANCF:inactive_other"                
[13] "PJY181_HEK3:active_cis"                     
[14] "PJY181_HEK3:active_trans"                   
[15] "PJY181_HEK3:inactive_cryptic_terminiation"  
[16] "PJY181_HEK3:inactive_other"                 
[17] "PJY182_HEK3:active_cis"                     
[18] "PJY182_HEK3:active_trans"                   
[19] "PJY182_HEK3:inactive_cryptic_terminiation"  
[20] "PJY182_HEK3:inactive_other"                 
[21] "PJY183_DNMT1:active_cis"                    
[22] "PJY183_DNMT1:active_trans"                  
[23] "PJY183_DNMT1:inactive_cryptic_terminiation" 
[24] "PJY183_DNMT1:inactive_other"                
[25] "PJY184_DNMT1:active_cis"                    
[26] "PJY184_DNMT1:active_trans"                  
[27] "PJY184_DNMT1:inactive_cryptic_terminiation" 
[28] "PJY184_DNMT1:inactive_other"                
[29] "PJY185_RUNX1:active_cis"                    
[30] "PJY185_RUNX1:active_trans"                  
[31] "PJY185_RUNX1:inactive_cryptic_terminiation" 
[32] "PJY185_RUNX1:inactive_other"                
[33] "PJY186_RUNX1:active_cis"                    
[34] "PJY186_RUNX1:active_trans"                  
[35] "PJY186_RUNX1:inactive_cryptic_terminiation" 
[36] "PJY186_RUNX1:inactive_other"                
[37] "PJY187_VEGFA:active_cis"                    
[38] "PJY187_VEGFA:active_trans"                  
[39] "PJY187_VEGFA:inactive_cryptic_terminiation" 
[40] "PJY187_VEGFA:inactive_other"                
[41] "PJY300_mDNMT1:active_cis"                   
[42] "PJY300_mDNMT1:active_trans"                 
[43] "PJY300_mDNMT1:inactive_cryptic_terminiation"
[44] "PJY300_mDNMT1:inactive_other"               
[45] "PJY302_EMX1:active_cis"                     
[46] "PJY302_EMX1:active_trans"                   
[47] "PJY302_EMX1:inactive_cryptic_terminiation"  
[48] "PJY302_EMX1:inactive_other"                 
[49] "PJY306_EMX1:active_cis"                     
[50] "PJY306_EMX1:active_trans"                   
[51] "PJY306_EMX1:inactive_cryptic_terminiation"  
[52] "PJY306_EMX1:inactive_other"                 

Import sample metadata and counts into DESeq2

Read these counts into DESeq2 along with the sample metadata.

Set the design to ~ exogenous_rna + day + cell_line.

Code
dds <- DESeqDataSetFromMatrix(
  countData = gene_counts %>%
    tibble::column_to_rownames("gene") %>%
    replace(is.na(.), 0),
  colData = sample_units,
  design = ~ exogenous_rna + day + cell_line
)
converting counts to integer mode
Code
dds
class: DESeqDataSet 
dim: 56739 56 
metadata(1): version
assays(1): counts
rownames(56739): GAS5 SNORD30 ...
  PJY306_EMX1:inactive_cryptic_terminiation PJY306_EMX1:inactive_other
rowData names(0):
colnames(56): Parental_mastermix1_day1_rep1
  Parental_mastermix1_day1_rep2 ... Parental_PJY300_epegRNA_day2_rep3
  Parental_PJY300_epegRNA_day2_rep4
colData names(9): sample_unit sample_name ... fq1 fq2

Sample comparisons

Variance Stabalized Transformation

Code
vsd <- vst(dds, blind = FALSE)

Heatmap of sample-to-sample distances

Code
sample_dists <- dist(t(assay(vsd)))

sample_dist_matrix <- as.matrix(sample_dists)
rownames(sample_dist_matrix) <- paste(vsd$cell_line,
  vsd$exogenous_rna,
  vsd$day,
  sep = "-"
)
colnames(sample_dist_matrix) <- NULL
Code
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sample_dist_matrix,
  clustering_distance_rows = sample_dists,
  clustering_distance_cols = sample_dists,
  col = colors
)

PCA plot of samples

Code
plotPCA(vsd, intgroup = c("cell_line", "exogenous_rna", "day"))

Differential Expression

Batch 2 - Day 1

Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds_batch2_day1 <- subset(dds, select = colData(dds)$batch == "batch2" &
  colData(dds)$day == "day1")
dds_batch2_day1$exogenous_rna <- droplevels(dds_batch2_day1$exogenous_rna)
dds_batch2_day1$day <- droplevels(dds_batch2_day1$day)
design(dds_batch2_day1) <- ~ exogenous_rna + cell_line
dds_batch2_day1 <- DESeq(dds_batch2_day1, parallel = TRUE)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
dds_batch2_day1
class: DESeqDataSet 
dim: 56739 16 
metadata(1): version
assays(4): counts mu H cooks
rownames(56739): GAS5 SNORD30 ...
  PJY306_EMX1:inactive_cryptic_terminiation PJY306_EMX1:inactive_other
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(16): P1E10_sorted_PJY103_pegRNA_day1_rep1
  P1E10_sorted_PJY103_pegRNA_day1_rep2 ...
  Parental_PJY300_epegRNA_day1_rep3 Parental_PJY300_epegRNA_day1_rep4
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch2_day1 <- results(dds_batch2_day1, alpha = 0.05)
res_batch2_day1
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 56739 rows and 6 columns
                                           baseMean log2FoldChange     lfcSE
                                          <numeric>      <numeric> <numeric>
GAS5                                        3358381      0.1558718 0.0336348
SNORD30                                     1605167      0.0910242 0.0577644
SNORD49A                                    1156164      0.1842127 0.0437422
SNORD26                                      857547      0.0954226 0.0541020
SNHG1                                        780897     -0.0217632 0.0401304
...                                             ...            ...       ...
PJY302_EMX1:inactive_other                        0             NA        NA
PJY306_EMX1:active_cis                            0             NA        NA
PJY306_EMX1:active_trans                          0             NA        NA
PJY306_EMX1:inactive_cryptic_terminiation         0             NA        NA
PJY306_EMX1:inactive_other                        0             NA        NA
                                               stat      pvalue        padj
                                          <numeric>   <numeric>   <numeric>
GAS5                                       4.634240 3.58252e-06 8.13534e-05
SNORD30                                    1.575783 1.15076e-01 4.53193e-01
SNORD49A                                   4.211322 2.53881e-05 4.97130e-04
SNORD26                                    1.763753 7.77736e-02 3.62974e-01
SNHG1                                     -0.542311 5.87604e-01 8.80000e-01
...                                             ...         ...         ...
PJY302_EMX1:inactive_other                       NA          NA          NA
PJY306_EMX1:active_cis                           NA          NA          NA
PJY306_EMX1:active_trans                         NA          NA          NA
PJY306_EMX1:inactive_cryptic_terminiation        NA          NA          NA
PJY306_EMX1:inactive_other                       NA          NA          NA
Code
summary(res_batch2_day1)

out of 51623 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 936, 1.8%
LFC < 0 (down)     : 1750, 3.4%
outliers [1]       : 1, 0.0019%
low counts [2]     : 23895, 46%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Log fold change shrinkage

Run log fold change shrinkage to account for low counts.

Code
res_batch2_day1_lfc <- lfcShrink(dds_batch2_day1,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA Plot (lfc shrunk)

MA Plot of shrunken log2 fold changes

Code
res_batch2_day1_lfc_df <- res_batch2_day1_lfc %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>%
  dplyr::mutate(significant = padj < 0.05)

write_tsv(res_batch2_day1_lfc_df, file = "diffexp_results_batch2_day1.tsv")

res_batch2_day1_lfc_labelled <- res_batch2_day1_lfc_df %>%
  dplyr::filter(gene %in% exogenous_rna_names) %>%
  na.omit()

ggplot(
  res_batch2_day1_lfc_df,
  aes(x = log2(baseMean), y = log2FoldChange, colour = significant)
) +
  geom_point() +
  ggrepel::geom_label_repel(
    data = res_batch2_day1_lfc_labelled,
    aes(label = gene, segment.colour = "black"),
    min.segment.length = 0,
    show.legend = FALSE
  ) +
  theme_bw()
Warning: Removed 5116 rows containing missing values (geom_point).

Counts of exogenous rna

Plot of counts for a single gene (with lowest adjusted p-value)

Code
plot_list <- list()
for (gene in res_batch2_day1_lfc_labelled$gene) {
  gene_prefix <- str_split(gene, "_")[[1]][1]
  d <- plotCounts(dds_batch2_day1,
    gene = gene,
    intgroup = "cell_line",
    returnData = TRUE
  )
  p <- ggplot(
    d %>%
      rownames_to_column("sample_unit") %>%
      dplyr::filter(grepl(gene_prefix, sample_unit)),
    aes(x = cell_line, y = count)
  ) +
    geom_point(position = position_jitter(width = 0.1, height = 0)) +
    ggtitle(gene)
  plot_list[[gene]] <- p
}
cowplot::plot_grid(plotlist = plot_list)

Batch 2 - Day 2

Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds_batch2_day2 <- subset(dds, select = colData(dds)$batch == "batch2" &
  colData(dds)$day == "day2")
dds_batch2_day2$exogenous_rna <- droplevels(dds_batch2_day2$exogenous_rna)
dds_batch2_day2$day <- droplevels(dds_batch2_day2$day)
design(dds_batch2_day2) <- ~ exogenous_rna + cell_line
dds_batch2_day2 <- DESeq(dds_batch2_day2, parallel = TRUE)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
dds_batch2_day2
class: DESeqDataSet 
dim: 56739 16 
metadata(1): version
assays(4): counts mu H cooks
rownames(56739): GAS5 SNORD30 ...
  PJY306_EMX1:inactive_cryptic_terminiation PJY306_EMX1:inactive_other
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(16): P1E10_sorted_PJY103_pegRNA_day2_rep1
  P1E10_sorted_PJY103_pegRNA_day2_rep2 ...
  Parental_PJY300_epegRNA_day2_rep3 Parental_PJY300_epegRNA_day2_rep4
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch2_day2 <- results(dds_batch2_day2, alpha = 0.05)
res_batch2_day2
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 56739 rows and 6 columns
                                           baseMean log2FoldChange     lfcSE
                                          <numeric>      <numeric> <numeric>
GAS5                                        3192415     0.18567762 0.0303372
SNORD30                                     1514330     0.15582664 0.0512420
SNORD49A                                    1170290     0.19302856 0.0428279
SNORD26                                      825436     0.09512922 0.0398450
SNHG1                                        678111     0.00437696 0.0311644
...                                             ...            ...       ...
PJY302_EMX1:inactive_other                        0             NA        NA
PJY306_EMX1:active_cis                            0             NA        NA
PJY306_EMX1:active_trans                          0             NA        NA
PJY306_EMX1:inactive_cryptic_terminiation         0             NA        NA
PJY306_EMX1:inactive_other                        0             NA        NA
                                               stat      pvalue        padj
                                          <numeric>   <numeric>   <numeric>
GAS5                                       6.120450 9.33112e-10 3.29420e-08
SNORD30                                    3.040992 2.35800e-03 2.71635e-02
SNORD49A                                   4.507080 6.57259e-06 1.38019e-04
SNORD26                                    2.387479 1.69644e-02 1.31205e-01
SNHG1                                      0.140447 8.88307e-01 9.76140e-01
...                                             ...         ...         ...
PJY302_EMX1:inactive_other                       NA          NA          NA
PJY306_EMX1:active_cis                           NA          NA          NA
PJY306_EMX1:active_trans                         NA          NA          NA
PJY306_EMX1:inactive_cryptic_terminiation        NA          NA          NA
PJY306_EMX1:inactive_other                       NA          NA          NA
Code
summary(res_batch2_day2)

out of 50456 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 879, 1.7%
LFC < 0 (down)     : 1598, 3.2%
outliers [1]       : 53, 0.11%
low counts [2]     : 25267, 50%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Log fold change shrinkage

Run log fold change shrinkage to account for low counts.

Code
res_batch2_day2_lfc <- lfcShrink(dds_batch2_day2,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA Plot (lfc shrunk)

MA Plot of shrunken log2 fold changes

Code
res_batch2_day2_lfc_df <- res_batch2_day2_lfc %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>%
  dplyr::mutate(significant = padj < 0.05)

write_tsv(res_batch2_day2_lfc_df, file = "diffexp_results_batch2_day2.tsv")

res_batch2_day2_lfc_labelled <- res_batch2_day2_lfc_df %>%
  dplyr::filter(gene %in% exogenous_rna_names) %>%
  na.omit()

ggplot(
  res_batch2_day2_lfc_df,
  aes(x = log2(baseMean), y = log2FoldChange, color = significant)
) +
  geom_point() +
  ggrepel::geom_label_repel(
    data = res_batch2_day2_lfc_labelled,
    aes(label = gene, segment.colour = "black"),
    min.segment.length = 0,
    show.legend = FALSE
  ) +
  theme_bw()
Warning: Removed 6283 rows containing missing values (geom_point).

Counts of exogenous rna

Plot of counts for a single gene (with lowest adjusted p-value)

Code
plot_list <- list()
for (gene in res_batch2_day2_lfc_labelled$gene) {
  gene_prefix <- str_split(gene, "_")[[1]][1]
  d <- plotCounts(dds_batch2_day2,
    gene = gene,
    intgroup = "cell_line",
    returnData = TRUE
  )
  p <- ggplot(
    d %>%
      rownames_to_column("sample_unit") %>%
      dplyr::filter(grepl(gene_prefix, sample_unit)),
    aes(x = cell_line, y = count)
  ) +
    geom_point(position = position_jitter(width = 0.1, height = 0)) +
    ggtitle(gene)
  plot_list[[gene]] <- p
}
cowplot::plot_grid(plotlist = plot_list)

Batch 3 - Day 1

Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds_batch3_day1 <- subset(dds, select = colData(dds)$batch == "batch3" &
  colData(dds)$day == "day1")
dds_batch3_day1$exogenous_rna <- droplevels(dds_batch3_day1$exogenous_rna)
dds_batch3_day1$day <- droplevels(dds_batch3_day1$day)
design(dds_batch3_day1) <- ~ exogenous_rna + cell_line
dds_batch3_day1 <- DESeq(dds_batch3_day1, parallel = TRUE)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
dds_batch3_day1
class: DESeqDataSet 
dim: 56739 12 
metadata(1): version
assays(4): counts mu H cooks
rownames(56739): GAS5 SNORD30 ...
  PJY306_EMX1:inactive_cryptic_terminiation PJY306_EMX1:inactive_other
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(12): Parental_mastermix1_day1_rep1
  Parental_mastermix1_day1_rep2 ... P1E10_sorted_mastermix2_day1_rep2
  P1E10_sorted_mastermix2_day1_rep3
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch3_day1 <- results(dds_batch3_day1, alpha = 0.05)
res_batch3_day1
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 56739 rows and 6 columns
                                           baseMean log2FoldChange     lfcSE
                                          <numeric>      <numeric> <numeric>
GAS5                                        3961074       0.244629 0.0339217
SNORD30                                     1821939       0.259129 0.0384136
SNORD49A                                    1512022       0.249914 0.0373254
SNORD26                                      978999       0.216320 0.0334087
SNHG1                                        822971       0.013230 0.0371898
...                                             ...            ...       ...
PJY302_EMX1:inactive_other                3288.7113     -1.0100853 0.0747537
PJY306_EMX1:active_cis                    3219.2788      0.8272604 0.0700896
PJY306_EMX1:active_trans                    66.5829     -1.1368624 0.1815416
PJY306_EMX1:inactive_cryptic_terminiation  101.0213     -1.2113201 0.1573057
PJY306_EMX1:inactive_other                 520.8476      0.0349175 0.0776916
                                                stat      pvalue        padj
                                           <numeric>   <numeric>   <numeric>
GAS5                                        7.211565 5.53126e-13 2.04824e-11
SNORD30                                     6.745745 1.52243e-11 4.98325e-10
SNORD49A                                    6.695550 2.14862e-11 6.93996e-10
SNORD26                                     6.474951 9.48428e-11 2.84995e-09
SNHG1                                       0.355742 7.22034e-01 9.28167e-01
...                                              ...         ...         ...
PJY302_EMX1:inactive_other                -13.512178 1.32541e-41 1.89310e-39
PJY306_EMX1:active_cis                     11.802892 3.77123e-32 3.85823e-30
PJY306_EMX1:active_trans                   -6.262271 3.79412e-10 1.07487e-08
PJY306_EMX1:inactive_cryptic_terminiation  -7.700421 1.35618e-14 5.66045e-13
PJY306_EMX1:inactive_other                  0.449437 6.53116e-01 9.07399e-01
Code
summary(res_batch3_day1)

out of 49779 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 865, 1.7%
LFC < 0 (down)     : 1742, 3.5%
outliers [1]       : 1, 0.002%
low counts [2]     : 27782, 56%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Log fold change shrinkage

Run log fold change shrinkage to account for low counts.

Code
res_batch3_day1_lfc <- lfcShrink(dds_batch3_day1,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA Plot (lfc shrunk)

MA Plot of shrunken log2 fold changes

Code
res_batch3_day1_lfc_df <- res_batch3_day1_lfc %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>%
  dplyr::mutate(significant = padj < 0.05)

write_tsv(res_batch3_day1_lfc_df, file = "diffexp_results_batch3_day1.tsv")

res_batch3_day1_lfc_labelled <- res_batch3_day1_lfc_df %>%
  dplyr::filter(gene %in% exogenous_rna_names) %>%
  na.omit()

ggplot(
  res_batch3_day1_lfc_df,
  aes(x = log2(baseMean), y = log2FoldChange, colour = significant)
) +
  geom_point() +
  ggrepel::geom_label_repel(
    data = res_batch3_day1_lfc_labelled,
    aes(label = gene, segment.colour = "black"),
    min.segment.length = 0,
    max.overlaps = 30,
    size = 2.5,
    show.legend = FALSE
  ) +
  theme_bw()
Warning: Removed 6960 rows containing missing values (geom_point).

Counts of exogenous rna

Plot of counts for a single gene (with lowest adjusted p-value)

Code
plot_list <- list()
for (gene in res_batch3_day1_lfc_labelled$gene) {
  gene_rna_type <- str_split(gene, ":")[[1]][1]
  exogenous_rna <- rna_mixes %>%
    dplyr::filter(rna_species == gene_rna_type) %>%
    pull(exogenous_rna)
  exogenous_rna_regex <- paste(exogenous_rna, collapse = "|")

  d <- plotCounts(dds_batch3_day1,
    gene = gene,
    intgroup = "cell_line",
    returnData = TRUE
  )
  p <- ggplot(
    d %>%
      rownames_to_column("sample_unit") %>%
      dplyr::filter(grepl(exogenous_rna_regex, sample_unit)),
    aes(x = cell_line, y = count)
  ) +
    geom_point(position = position_jitter(width = 0.1, height = 0)) +
    ggtitle(gene)
  plot_list[[gene]] <- p
}
cowplot::plot_grid(plotlist = plot_list, ncol = 3)

Batch 3 - Day 2

Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds_batch3_day2 <- subset(dds, select = colData(dds)$batch == "batch3" &
  colData(dds)$day == "day2")
dds_batch3_day2$exogenous_rna <- droplevels(dds_batch3_day2$exogenous_rna)
dds_batch3_day2$day <- droplevels(dds_batch3_day2$day)
design(dds_batch3_day2) <- ~ exogenous_rna + cell_line
dds_batch3_day2 <- DESeq(dds_batch3_day2, parallel = TRUE)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
dds_batch3_day2
class: DESeqDataSet 
dim: 56739 12 
metadata(1): version
assays(4): counts mu H cooks
rownames(56739): GAS5 SNORD30 ...
  PJY306_EMX1:inactive_cryptic_terminiation PJY306_EMX1:inactive_other
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(12): Parental_mastermix1_day2_rep1
  Parental_mastermix1_day2_rep2 ... P1E10_sorted_mastermix2_day2_rep2
  P1E10_sorted_mastermix2_day2_rep3
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch3_day2 <- results(dds_batch3_day2, alpha = 0.05)
res_batch3_day2
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 56739 rows and 6 columns
                                           baseMean log2FoldChange     lfcSE
                                          <numeric>      <numeric> <numeric>
GAS5                                        4358516      0.3665465 0.0345032
SNORD30                                     2397544      0.3290442 0.0411835
SNORD49A                                    1753255      0.4151539 0.0481346
SNORD26                                     1116921      0.2293532 0.0408765
SNHG1                                        818060      0.0379035 0.0420720
...                                             ...            ...       ...
PJY302_EMX1:inactive_other                4936.7925      -0.727201 0.1027480
PJY306_EMX1:active_cis                    5015.0092       0.834158 0.0890741
PJY306_EMX1:active_trans                    79.5176      -0.985936 0.1874460
PJY306_EMX1:inactive_cryptic_terminiation   88.0663      -0.468875 0.1848268
PJY306_EMX1:inactive_other                 641.0085       0.192428 0.0907099
                                               stat      pvalue        padj
                                          <numeric>   <numeric>   <numeric>
GAS5                                      10.623543 2.31601e-26 2.04618e-24
SNORD30                                    7.989717 1.35249e-15 6.09786e-14
SNORD49A                                   8.624861 6.41704e-18 3.47315e-16
SNORD26                                    5.610887 2.01292e-08 4.93093e-07
SNHG1                                      0.900919 3.67631e-01 7.16392e-01
...                                             ...         ...         ...
PJY302_EMX1:inactive_other                 -7.07751 1.46763e-12 5.33564e-11
PJY306_EMX1:active_cis                      9.36476 7.62204e-21 5.14509e-19
PJY306_EMX1:active_trans                   -5.25984 1.44181e-07 3.11865e-06
PJY306_EMX1:inactive_cryptic_terminiation  -2.53684 1.11859e-02 7.42360e-02
PJY306_EMX1:inactive_other                  2.12135 3.38921e-02 1.72190e-01
Code
summary(res_batch3_day2)

out of 50054 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 1170, 2.3%
LFC < 0 (down)     : 2135, 4.3%
outliers [1]       : 6, 0.012%
low counts [2]     : 26017, 52%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Log fold change shrinkage

Run log fold change shrinkage to account for low counts.

Code
res_batch3_day2_lfc <- lfcShrink(dds_batch3_day2,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA Plot (lfc shrunk)

MA Plot of shrunken log2 fold changes

Code
res_batch3_day2_lfc_df <- res_batch3_day2_lfc %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>%
  dplyr::mutate(significant = padj < 0.05)

write_tsv(res_batch3_day2_lfc_df, file = "diffexp_results_batch3_day2.tsv")

res_batch3_day2_lfc_labelled <- res_batch3_day2_lfc_df %>%
  dplyr::filter(gene %in% exogenous_rna_names) %>%
  na.omit()

ggplot(
  res_batch3_day2_lfc_df,
  aes(x = log2(baseMean), y = log2FoldChange, color = significant)
) +
  geom_point() +
  ggrepel::geom_label_repel(
    data = res_batch3_day2_lfc_labelled,
    mapping = aes(label = gene, segment.colour = "black"),
    min.segment.length = 0,
    max.overlaps = 30,
    size = 2.5,
    show.legend = FALSE
  ) +
  theme_bw()
Warning: Removed 6685 rows containing missing values (geom_point).

Counts of exogenous rna

Plot of counts for a single gene (with lowest adjusted p-value)

Code
plot_list <- list()
for (gene in res_batch3_day2_lfc_labelled$gene) {
  gene_rna_type <- str_split(gene, ":")[[1]][1]
  exogenous_rna <- rna_mixes %>%
    dplyr::filter(rna_species == gene_rna_type) %>%
    pull(exogenous_rna)
  exogenous_rna_regex <- paste(exogenous_rna, collapse = "|")

  d <- plotCounts(dds_batch3_day2,
    gene = gene,
    intgroup = "cell_line",
    returnData = TRUE
  )
  p <- ggplot(
    d %>%
      rownames_to_column("sample_unit") %>%
      dplyr::filter(grepl(exogenous_rna_regex, sample_unit)),
    aes(x = cell_line, y = count)
  ) +
    geom_point(position = position_jitter(width = 0.1, height = 0)) +
    ggtitle(gene)
  plot_list[[gene]] <- p
}
cowplot::plot_grid(plotlist = plot_list, ncol = 3)